# Loading the data using R provided
library(foreign)
# Parse a Stata-style .dct dictionary file into column metadata.
#
# Args:
#   dct: path to the dictionary file; the relevant lines look like
#        `_column(<start>)  <type>  <name>  %<width><fmt>  "<label>"`
#   labels.included: "yes" if each column line carries a quoted label
#        (adds a ColLabel field), "no" otherwise.
#
# Returns a named list with elements StartPos, Str, ColName, ColWidth
# (and ColLabel when labels.included = "yes").
read.dct <- function(dct, labels.included = "yes") {
  temp <- readLines(dct)
  # keep only the column-definition lines
  temp <- temp[grepl("_column", temp)]
  switch(labels.included,
  yes = {
  pattern <- "_column\\(([0-9]+)\\)\\s+([a-z0-9]+)\\s+(.*)\\s+%([0-9]+)[a-z]\\s+(.*)"
  classes <- c("numeric", "character", "character", "numeric", "character")
  N <- 5
  NAMES <- c("StartPos", "Str", "ColName", "ColWidth", "ColLabel")
  },
  no = {
  pattern <- "_column\\(([0-9]+)\\)\\s+([a-z0-9]+)\\s+(.*)\\s+%([0-9]+).*"
  classes <- c("numeric", "character", "character", "numeric")
  N <- 4
  NAMES <- c("StartPos", "Str", "ColName", "ColWidth")
  })
  # Pull capture group x out of every line, tidy it, and coerce to the
  # declared class. seq_len()/trimws()/paste0() replace the 1:N, regex-trim
  # and paste(sep = "") idioms of the original.
  temp_metadata <- setNames(lapply(seq_len(N), function(x) {
    out <- gsub(pattern, paste0("\\", x), temp)
    out <- trimws(out)
    out <- gsub('\"', "", out, fixed = TRUE)  # strip the quotes around labels
    class(out) <- classes[x]  # class<- "numeric" coerces the strings to numbers
    out
  }), NAMES)
  # Make the parsed column names syntactically valid R names.
  temp_metadata[["ColName"]] <- make.names(gsub("\\s", "", temp_metadata[["ColName"]]))
  temp_metadata
}
# Read a fixed-width GSS data file using the column widths and names parsed
# from the dictionary (see read.dct). `labels.included` is accepted only for
# symmetry with read.dct; it is not used here.
read.dat <- function(dat, metadata_var, labels.included = "yes") {
  col_widths <- metadata_var[["ColWidth"]]
  col_names <- metadata_var[["ColName"]]
  read.fwf(dat, widths = col_widths, col.names = col_names)
}
# Parse the dictionary, then read the fixed-width data file it describes.
GSS_metadata <- read.dct("GSS.dct")
GSS_ascii <- read.dat("GSS.dat", GSS_metadata)
# Stash the human-readable column labels on the data frame for later reference.
attr(GSS_ascii, "col.label") <- GSS_metadata[["ColLabel"]]
GSS <- GSS_ascii
head(GSS)
# Lowercase the GSS column names to match the mapping files' values.
# Vectorized replacement for the original one-column-at-a-time loop
# (identical end state: every name lowercased).
colnames(GSS) <- tolower(colnames(GSS))
head(GSS)
summary(GSS)
# first, map column values to human readable EXCEPT for continuous features listed below in skip_cols
# Expected columns (used below): Variable.Name, Code, Label.
code_mappings = read.table(file='data/code_mappings_tab_delim.txt', sep='\t', header=TRUE)
head(code_mappings)
# loop through each column, and map the code to the text value
# skip listed columns because these have typed answers
library(dplyr)
# Keep a numeric copy of the job-satisfaction code before it is label-mapped.
GSS['jobsat_num'] <- GSS['satjob']
# Columns holding genuine numbers (ids, counts, years): never label-map these.
# A plain character vector -- the original wrapped them in a one-row
# data.frame and unlist()ed it on every loop pass.
skip_cols <- c("jobsat_num", "year", "age", "educ", "childs", "id_", "hrs1",
               "hrs2", "hompop", "babies", "teens", "preteen", "adults",
               "unrelat", "earnrs")
columns_gss <- names(GSS)
for (i in columns_gss) {
  print(i)
  if (i %in% skip_cols) {
    print('skipping')
    next
  }
  # Lookup table for this variable: numeric code -> human-readable label.
  rows <- which(code_mappings$Variable.Name == i)
  map_df <- data.frame(code_mappings$Code[rows], code_mappings$Label[rows])
  names(map_df) <- c(i, "to")
  # NOTE(review): all.y = TRUE also appends a row for every code that never
  # occurs in the data (all other columns NA); preserved as-is, but verify
  # it is intended. merge() also reorders rows and columns.
  GSS <- merge(x = GSS, y = map_df, by = i, all.x = TRUE, all.y = TRUE)
  GSS[i] <- GSS['to']
  GSS['to'] <- NULL
  print("done")
}
# make sure it worked
print(head(GSS))
# # change column names using:
column_name_mappings = read.table(file='data/column_name_mapping_tab_delim.txt', sep='\t', header=TRUE)
head(column_name_mappings)
# loop through each column, and map the code to the text value
# NOTE(review): when a column has no entry in the mapping file, the [1]
# lookup returns NA and toString(NA) yields the literal string "NA", so every
# unmapped column is renamed to "NA". The later rename to jobsat_num (after
# make.names) depends on this quirk -- do not "fix" it without updating that.
columns_gss = names(GSS)
for (i in columns_gss){
new_column_name = toString(column_name_mappings$Label[which(column_name_mappings$Name==i)][1])
names(GSS)[names(GSS) == i] <- new_column_name
}
print(head(GSS))
# split out factors from continuous predictors:
# predictors - factors
# Categorical predictor column names, as a plain character vector (the
# original packed them into a 1-row data.frame, which only worked because
# iterating a data.frame yields its columns).
predictors_factors <- c(
  "Was r born in this country",
  "Region of interview",
  "Does r supervise others at work in main job",
  "Political party affiliation",
  "Reason not living with parents",
  "Living with parents when 16 yrs old",
  "Geographic mobility since age 16",
  "Condition of health",
  "How often does r find work stressful",
  "Region of residence, age 16",
  "Type of place lived in when 16 yrs old",
  "Rs industry code (naics 2007)",
  "Rs census occupation code (2010)",
  "R self-emp or works for somebody",
  "Ever work as long as one year",
  "Labor force status",
  "Marital status",
  "Ever been divorced or separated",
  "Spouse labor force status",
  "Race of respondent",
  "Respondents sex",
  "Rs highest degree",
  "Respondents income",
  "Total family income",
  "Number of employees: rs work site",
  "Size of place in 1000s"
)
# Continuous (numeric) predictor column names, as a plain character vector
# (the original used a 1-row data.frame).
continuous_factors <- c(
  "Age of respondent",
  "How many in family earned money",
  "Number in household not related",
  "Household members 18 yrs and older",
  "Household members 13 thru 17 yrs old",
  "Household members 6 thru 12 yrs old",
  "Household members less than 6 yrs old",
  "Number of persons in household",
  "Number of hours usually work a week",
  "Number of hours worked last week",
  "Number of children",
  "Highest year of school completed"
)
# time series fields
# Plain strings instead of 1x1 data.frames (the original wrapper added nothing).
time_fields <- "Gss year for this respondent"
id_fields <- "Respondent id number"
# responses
# Candidate response (outcome) column names, as a plain character vector.
responses <- c(
  "Job or housework", # satisfaction with job
  "Satisfaction with financial situation",
  "General happiness"
)
# unknown:
# - Src beltcode
# - Expanded norc size code
GSS['counter'] = 1
# turn into NAs: "Not applicable", "No answer", etc. for all columns
library(dplyr)
# dplyr::na_if() is documented for vectors, not whole data frames (recent
# dplyr versions reject a data.frame), so blank out the non-substantive
# answer labels with a base-R logical-matrix mask instead -- same effect.
for (na_label in c("Not applicable", "No answer", "Dont know",
                   "Refused", "No issp", "Cant choose")) {
  hits <- GSS == na_label      # logical matrix; NA where data already NA
  hits[is.na(hits)] <- FALSE   # never index an assignment with NA
  GSS[hits] <- NA
}
# turn into NANs:
## 0 age, negative values in any of the continuous_factors
## 9, 99 or 98 in any of the continuous variables - these values map to 'No answer' and 'Dont know'
## Important: this cannot be done for Respondent ID because that makes null IDS, which is not valid
# NOTE(review): na_if() is applied here to one-column data frames
# (GSS[colname], single bracket) -- this worked with the dplyr version the
# script was written against, but newer dplyr requires vectors; confirm the
# pinned dplyr version before upgrading.
col_names = names(GSS)
for (i in col_names) {
colname = toString(i)
if(colname == 'Respondent id number'){
next
}
GSS[colname] = na_if(GSS[colname], "9")
GSS[colname] = na_if(GSS[colname], "99")
GSS[colname] = na_if(GSS[colname], "98")
# NOTE(review): "13" is not in the sentinel list documented above (9/99/98)
# -- confirm this extra value is intentional and not a leftover.
GSS[colname] = na_if(GSS[colname], "13")
}
GSS$"Age of respondent" = na_if(GSS$"Age of respondent",0)
# NOTE(review): expss presumably also exports na_if(), masking dplyr::na_if
# from this point on; the lt(0) criterion below only makes sense with
# expss's version -- verify the masking order.
library(expss) # for using the lt() function below, means "less than"
for (i in continuous_factors) {
colname = toString(i)
GSS[colname] = na_if(GSS[colname], lt(0))
}
# remove NA responses
GSS = GSS[!is.na(GSS$"Job or housework"), ]
# Look at that cleaned data!
head(GSS)
library(ggplot2)
# make the graphs small
options(repr.plot.width=5, repr.plot.height=5)
# counts -- reference the column directly inside aes(): using GSS$ inside
# aes() bypasses ggplot's data masking, and na.rm is a geom argument, not an
# aesthetic (it previously triggered an "unknown aesthetics" warning).
ggplot(GSS, aes(x = `Job or housework`)) +
  geom_bar(na.rm = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Distribution of job satisfaction")
# Convert every categorical predictor column of GSS into a factor.
for (pred_name in predictors_factors) {
  nm <- toString(pred_name)
  GSS[[nm]] <- factor(GSS[[nm]])
}
# take a look at them
library(ggplot2)
library(Hmisc)
options(repr.plot.width=4, repr.plot.height=4)
for (i in predictors_factors){
  print(i)
  colname <- toString(i)
  # .data pronoun keeps the mapping tied to the plot's own data instead of
  # indexing the global GSS from inside aes()
  plot <- ggplot(GSS, aes(x = .data[[colname]])) +
    geom_bar() +
    xlab(colname) +
    scale_x_discrete(label = abbreviate) +
    theme(axis.text.x = element_text(angle = 90))
  print(plot)
}
# Drop factor levels that no longer occur after the NA recoding above.
GSS <- droplevels(GSS)
# Review the three candidate response variables.
summary(GSS[["Job or housework"]])
summary(GSS[["Satisfaction with financial situation"]])
summary(GSS[["General happiness"]])
# Plot continuous predictors
for (i in continuous_factors) {
  print(i)
  colname <- toString(i)
  print(summary(GSS[colname]))
  # BUG FIX: hist() needs a numeric vector; GSS[colname] (single bracket) is
  # a one-column data frame and makes hist() fail -- extract with [[ ]].
  print(hist(GSS[[colname]], main = colname, xlab = colname))
}
ggplot(GSS, aes(x=`Number of hours usually work a week`)) + geom_histogram()
# Contingency tables
# Age by job satisfaction: sum of the per-row counter, i.e. counts per
# (satisfaction, age) cell.
xtabs(counter ~ `Job or housework`+`Age of respondent`, data=GSS)
# No rows have been removed yet.
# We converted the values "Not applicable", "No answer", and "Dont know" to R's encoded NAs.
#*** fit ordinal logistic regression on a few easy variables just to see if it's the right direction
# not final model, just a sandbox
library(MASS)
# Proportional-odds model: job satisfaction from a few work-related predictors.
olrmod_job <- polr(
  `Job or housework` ~ `R self-emp or works for somebody` +
    `Respondents income` +
    `How often does r find work stressful` +
    `Number of hours usually work a week`,
  data = GSS,
  Hess = TRUE
)
summary(olrmod_job)
#*** p values for ordinal logistic regression
# Two-sided normal approximation from the coefficient t statistics.
summary_table <- coef(summary(olrmod_job))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# happiness response with job predictors
library(MASS)
# Same work-related predictors, now with general happiness as the response.
olr_happiness_job <- polr(
  `General happiness` ~ `R self-emp or works for somebody` +
    `Respondents income` +
    `How often does r find work stressful` +
    `Number of hours usually work a week`,
  data = GSS,
  Hess = TRUE
)
print(summary(olr_happiness_job))
#*** p values for ordinal logistic regression -- THIS WORKS
# Two-sided normal approximation from the coefficient t statistics.
summary_table <- coef(summary(olr_happiness_job))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
print(summary_table)
# happiness response with age predictor, just out of curiosity
library(MASS)
# Single-predictor sanity check: happiness vs. age.
olr_happiness_age <- polr(`General happiness` ~ `Age of respondent`,
                          data = GSS, Hess = TRUE)
print(summary(olr_happiness_age))
# Two-sided p values from the coefficient t statistics.
summary_table <- coef(summary(olr_happiness_age))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
print(summary_table)
# look at distribution of responses by year
library(tidyr)
# Plot response counts and proportions
# Build a long-format table of per-year response counts (plus an "NA" row)
# and their within-year proportions for response column `resp` of the global
# GSS data frame.
# NOTE(review): levels(GSS[1, resp]) returns NULL when the column is a plain
# character vector -- this function presumably relies on the column being a
# factor (pre-R-4.0 stringsAsFactors behavior made merge() produce factors);
# confirm under the R version in use.
# NOTE(review): the local `options` shadows base::options() inside this
# function; the years 1972:2018 include years with no GSS wave (those rows
# come out as all-zero counts).
plot_resp = function(resp) {
# count number of each response each year
counts = data.frame()
options = levels(GSS[1,resp])
for (i in 1972:2018) {
oneyear = c(i)
for (j in 1:length(options)) {
oneyear = c(oneyear,
sum(GSS[GSS[resp]==options[j] & GSS$`Gss year for this respondent`==i, "counter"], na.rm = T))
}
# extra cell: count of NA responses for this year
oneyear = c(oneyear, sum(GSS[is.na(GSS[resp]) & GSS$`Gss year for this respondent`==i, "counter"], na.rm=T))
counts = rbind(counts, oneyear)
}
options = c(options, "NA")
colnames(counts) = c("Year", options)
#print(head(counts))
# reshape wide -> long: one row per (Year, Response)
counts_long <- gather(counts, Response, Count, options[1]:options[length(options)], factor_key=TRUE)
#print(head(counts_long))
# calculate proportions of responses
prop = c()
for (i in 1:nrow(counts_long)) {
prop = c(prop,
counts_long$Count[i]/sum(counts_long[counts_long$Year==counts_long[i, "Year"], "Count"]))
}
# years with zero total produce NaN (0/0) -> treat as 0
prop[is.na(prop)] = 0
counts_long$Proportion = prop
print(head(counts_long))
return(counts_long)
}
# JOB OR HOUSEWORK
# Per-year counts/proportions of the job-satisfaction responses.
jobsat_counts = plot_resp("Job or housework")
# make the graphs wider
options(repr.plot.width=16, repr.plot.height=6)
# plot absolute frequency
ggplot(jobsat_counts, aes(fill=Response, x=Year, y=Count)) +
geom_bar(position="dodge", stat="identity", width=0.9) +
ggtitle("Job/housework Satisfaction Response Count by Year") +
theme(axis.text.x = element_text(angle = 90), legend.position="bottom", plot.title = element_text(size=20))
# plot relative frequency (stacked: proportions within each year sum to 1)
ggplot(jobsat_counts, aes(fill=Response, x=Year, y=Proportion)) +
geom_bar(stat="identity", width=0.9) +
ggtitle("Job/housework Satisfaction Response Proportion by Year, Including NAs") +
theme(axis.text.x = element_text(angle = 90), legend.position="bottom", plot.title = element_text(size=20))
#scale_x_discrete(breaks=seq(1970, 2020, 5))
# plot relative frequency, no NAs
jobsat_noNA = jobsat_counts[jobsat_counts$Response!="NA",]
#tail(jobsat_noNA)
# Recompute within-year proportions after dropping the "NA" response row.
# ave() sums Count per Year in one vectorized pass, replacing the original
# O(n^2) row-at-a-time loop (same values, same alignment).
prop <- jobsat_noNA$Count / ave(jobsat_noNA$Count, jobsat_noNA$Year, FUN = sum)
prop[is.na(prop)] <- 0  # years with zero total give NaN (0/0) -> treat as 0
jobsat_noNA$Proportion <- prop
print(head(jobsat_noNA))
ggplot(jobsat_noNA, aes(fill=Response, x=Year, y=Proportion)) +
  geom_bar(stat="identity", width=0.9) +
  ggtitle("Job/housework Satisfaction Response Proportion by Year, Excluding NAs") +
  theme(axis.text.x = element_text(angle = 90), legend.position="bottom", plot.title = element_text(size=20))
# proportions don't change much, data may be indep of year
# MVORD library: https://cran.r-project.org/web/packages/mvord/vignettes/vignette_mvord.pdf
# The R package mvord implements composite likelihood estimation in the class of multivariate ordinal regression models with a multivariate probit and a multivariate logit link. A flexible modeling framework for multiple ordinal measurements on the same subject is set up, which takes into consideration the dependence among the multiple observations by employing different error structures. Heterogeneity in the error structure across the subjects can be accounted for by the package, which allows for covariate dependent error structures. In addition, different regression coefficients and threshold parameters for each response are supported. If a reduction of the parameter space is desired, constraints on the threshold as well as on the regression coefficients can be specified by the user. The proposed multivariate framework is illustrated by means of a credit risk application.
# An even better example: https://rdrr.io/cran/mvord/man/mvord.html
library(dplyr)
# make.names() turns the spaced labels into dot-separated syntactic names.
names(GSS)<-make.names(names(GSS),unique = TRUE)
# NOTE(review): the earlier column-name mapping renamed unmatched columns to
# the literal string "NA" (toString(NA)); make.names(unique = TRUE) turned
# one of them into "NA.". This rename relies on that quirk -- verify which
# column "NA." actually is before changing the mapping loop.
GSS = rename(GSS, jobsat_num = NA.)
# Strip the long run of trailing dots make.names() produced for the year column.
GSS = rename(GSS, Gss.year.for.this.respondent = Gss.year.for.this.respondent.......................)
library(dplyr)
#select subset of data
# Restrict to the 2016/2018 waves (earlier years make mvord far too slow).
subset_dat <- GSS$Gss.year.for.this.respondent %in% c("2016","2018") #"2010","2012","2014", takes too long to run
GSS_subset <- GSS[subset_dat,]
# Add columns here to use in mvord/mixor
GSS_job_sat = data.frame(subset(GSS_subset, select=c("Job.or.housework",
"jobsat_num",
"R.self.emp.or.works.for.somebody",
"Respondent.id.number",
"Gss.year.for.this.respondent",
"Respondents.income",
"How.often.does.r.find.work.stressful",
"Number.of.hours.usually.work.a.week",
"Age.of.respondent"
)))
# na.omit() drops every row containing an NA in ANY column, which subsumes
# the per-column !is.na() filters the original applied one by one.
GSS_job_sat <- na.omit(GSS_job_sat)
print(head(GSS_job_sat))
# using mvord library
# only using 2016 and 2018 data where none of the values listed below are NA
# mvord is not able to handle NAs
library(mvord)
# Multivariate ordinal regression: MMO(response, subject id, measurement
# index) models repeated satisfaction measurements per respondent id across
# years, with an AR(1) error structure and a multivariate logit link.
# NOTE(review): respondent ids are reused across GSS waves (see the
# respondent_1 check later in this script), so treating id as a stable
# subject identifier here is questionable -- confirm before trusting output.
mvord_1 <- mvord(data = GSS_job_sat,
formula = MMO(Job.or.housework,Respondent.id.number,Gss.year.for.this.respondent) ~
Age.of.respondent,
error.structure = cor_ar1(~ 1),
link = mvlogit(),
PL.lag = 2)
cat("made model")
print(summary(mvord_1))
# Marginal distribution of job satisfaction.
prob_job_sat = prop.table(xtabs(counter ~ Job.or.housework, data=GSS))
prob_job_sat
# Summed counts of satisfaction by income.
income_dist = aggregate(GSS$counter, by=list(Job.or.housework=GSS$Job.or.housework,Respondents.income=GSS$Respondents.income), FUN=sum)
# BUG FIX: the chi-square independence test (summary.table) must run on raw
# counts; it was previously run on the prop.table() proportions, which sum
# to 1 and give a meaningless statistic and p value.
tab_job_sat_income = xtabs(x ~ Job.or.housework+Respondents.income, data=income_dist)
prob_job_sat_income = prop.table(tab_job_sat_income)
prob_job_sat_income
summary(tab_job_sat_income) # chi-square test of independence on the counts
options(repr.plot.width=12, repr.plot.height=12)
dotchart(prob_job_sat_income)
# mosaicplot(prob_job_sat_income)
# (the duplicate recomputation of income_dist was removed -- it was identical)
print(head(income_dist))
# NOTE(review): binomial glm on a multi-level satisfaction factor models only
# "first level vs the rest"; interpret with care (an ordinal model fits better).
glm_sat_income = glm(Job.or.housework ~ Respondents.income, data=income_dist, family=binomial)
summary(glm_sat_income)
# using mixor library
library(mixor)
# mixor_df = GSS_job_sat[order(GSS_job_sat$Respondent.id.number, GSS_job_sat$Gss.year.for.this.respondent),]
# mixor requires the data sorted by the id variable; sort by id then year.
mixor_df = GSS[order(GSS$Respondent.id.number, GSS$Gss.year.for.this.respondent),]
# Mixed-effects ordinal regression of the numeric satisfaction code; the
# commented-out predictors were tried and set aside.
# NOTE(review): respondent ids are reused across waves (shown later), so
# id=Respondent.id.number groups unrelated people -- presumably the cause of
# the all-zero estimates noted below; confirm.
mixor_1<-mixor(jobsat_num ~ factor(Gss.year.for.this.respondent) +
# factor(R.self.emp.or.works.for.somebody),# +
factor(Respondents.income),# +
# factor(How.often.does.r.find.work.stressful),# +
# factor(Number.of.hours.usually.work.a.week),
data=mixor_df, id=Respondent.id.number)
summary(mixor_1)
# No idea why all of these are 0
# Great resource: https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/
# Plot continuous predictors
# Post-rename (make.names) column names of the predictors to plot, as a
# plain character vector (the original used a 1-row data.frame).
continuous_factors_plot <- c(
  "Age.of.respondent",
  "How.many.in.family.earned.money",
  "Number.in.household.not.related",
  "Household.members.18.yrs.and.older",
  "Household.members.13.thru.17.yrs.old",
  "Household.members.6.thru.12.yrs.old",
  "Household.members.less.than.6.yrs.old",
  "Number.of.persons.in.household",
  "Total.family.income",
  "Respondents.income",
  "Number.of.employees..rs.work.site",
  "How.often.does.r.find.work.stressful",
  "Size.of.place.in.1000s",
  "Number.of.hours.usually.work.a.week",
  "Number.of.hours.worked.last.week",
  "Number.of.children",
  "Highest.year.of.school.completed",
  "Gss.year.for.this.respondent"
)
# Drop rows with a missing response, then box-plot each continuous predictor
# against the satisfaction response.
GSS = GSS[!is.na(GSS$Job.or.housework), ]
options(repr.plot.width=5, repr.plot.height=5)
for (i in continuous_factors_plot) {
  print(i)
  colname <- toString(i)
  # .data pronoun: refer to the column through the plot's own data rather
  # than indexing the global GSS inside aes()
  plot <- ggplot(GSS, aes(x = .data[[colname]], y = Job.or.housework)) +
    geom_boxplot(size = .75) +
    geom_jitter(alpha = .5) +
    xlab(colname) +
    # facet_grid(Respondents.income ~ How.often.does.r.find.work.stressful, margins = TRUE) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
  print(plot)
}
# get total response rate for all respondents for all of time:
# Number of rows (survey appearances) per respondent id.
response_dist = aggregate(GSS$counter, by=list(Respondent.id.number=GSS$Respondent.id.number), FUN=sum)
print(hist(response_dist$x))
# NOTE(review): the divisor 33 is presumably the number of GSS survey waves
# in the data -- confirm; if the wave count changes this constant silently
# goes stale.
response_rate = aggregate(GSS$counter/33, by=list(Respondent.id.number=GSS$Respondent.id.number), FUN=sum)
mean(response_rate$x)
response_rate
# is respondent 1 the same every year?
respondent_1 = GSS[ which(GSS$Respondent.id.number==1), ]
respondent_1
# NO, trickery. (ids are reused across waves, not stable people)
# randomly pick 1 year (after 2008 crisis just in case)
set.seed(99)
yr = sample(seq(2010, 2018, by=2), 1) # only have data every other year
yr
subset_yr <- GSS$Gss.year.for.this.respondent == yr
GSS_yr <- GSS[subset_yr,]
# split data into groups by category (otherwise models don't work)
# NOTE(review): the data.frame(subset(...)) wrapper looks redundant, but
# under R < 4.0 data.frame() re-applied stringsAsFactors = TRUE, turning the
# mapped character columns into factors (which polr() requires) -- confirm
# the R version before simplifying.
# job-related variables
job_rel = data.frame(subset(GSS_yr, select=c(Job.or.housework,
# Does.r.supervise.others.at.work.in.main.job,
How.often.does.r.find.work.stressful,
R.self.emp.or.works.for.somebody,
Labor.force.status,
Respondents.income,
Number.of.employees..rs.work.site,
# Rs industry code (naics 2007), # too many categories
# Rs census occupation code (2010), # too many categories
# Ever work as long as one year, # doesn't work, only 887 not NAs: length(job_rel$Ever.work.as.long.as.one.year[!is.na(job_rel$Ever.work.as.long.as.one.year)])
# Number of hours usually work a week, # doesn't work, only 40 not NAs
Number.of.hours.worked.last.week)))
# family-related variables
job_family = data.frame(subset(GSS_yr, select=c(Job.or.housework,
Marital.status,
Spouse.labor.force.status,
Number.of.children,
How.many.in.family.earned.money,
Number.in.household.not.related,
Household.members.18.yrs.and.older,
Household.members.13.thru.17.yrs.old,
Household.members.6.thru.12.yrs.old,
Household.members.less.than.6.yrs.old,
Number.of.persons.in.household,
Total.family.income)))
#head(job_family)
# personal history variables
job_past = data.frame(subset(GSS_yr, select=c(Job.or.housework,
# Reason.not.living.with.parents, failed algorithm did not converge
Was.r.born.in.this.country,
Living.with.parents.when.16.yrs.old,
Geographic.mobility.since.age.16,
Region.of.residence..age.16,
Type.of.place.lived.in.when.16.yrs.old,
Ever.been.divorced.or.separated))) # family?
#head(job_past)
# other variables
job_other = data.frame(subset(GSS_yr, select=c(Job.or.housework,
Region.of.interview,
Political.party.affiliation,
Condition.of.health,
Race.of.respondent,
Respondents.sex,
Rs.highest.degree,
Age.of.respondent,
Highest.year.of.school.completed)))
#head(job_other)
# Combined frames: [,-1] drops the duplicated Job.or.housework response
# column from each appended group so it appears exactly once.
job_all = cbind(job_rel, job_family[,-1], job_past[,-1], job_other[,-1])
#head(job_all)
job_rel_family = cbind(job_rel, job_family[,-1])
job_rel_past = cbind(job_rel, job_past[,-1])
job_rel_other = cbind(job_rel, job_other[,-1])
print(head(job_all))
# full model - DOESN'T WORK
library(MASS)
# All predictor groups at once: polr fails with "attempt to find suitable
# starting values failed" (see the note below) -- presumably quasi-separation
# from many sparse factor levels; confirm before retrying with maxit/start.
olrmod_job_all <- polr(Job.or.housework ~ ., data = job_all, Hess = TRUE)
summary(olrmod_job_all)
# After including too many predictors, we get the error "attempt to find suitable starting values failed".
# Maybe because:
# "There is one fairly common circumstance in which both convergence problems and the Hauck-Donner phenomenon can occur. This is when the fitted probabilities are extremely close to zero or one. Consider a medical diagnosis problem with thousands of cases and around 50 binary explanatory variable (which may arise from coding fewer categorical variables); one of these indicators is rarely true but always indicates that the disease is present. Then the fitted probabilities of cases with that indicator should be one, which can only be achieved by taking beta_i = infinity. The result from glm will be warnings and an estimated coefficient of around +/- 10. There has been fairly extensive discussion of this in the statistical literature, usually claiming non-existence of maximum likelihood estimates; see Sautner and Duffy (1989, p. 234)."
# See https://stackoverflow.com/questions/55416164/ordinal-logistic-regression-in-r and https://stackoverflow.com/questions/8596160/why-am-i-getting-algorithm-did-not-converge-and-fitted-prob-numerically-0-or
# construct a model for each category
# partial model: job-related variables
olrmod_job_rel <- polr(Job.or.housework ~ ., data = job_rel, Hess = TRUE)
# Deviance-based goodness-of-fit summary.
dev_rel <- olrmod_job_rel$deviance
dfres_rel <- olrmod_job_rel$df.residual
cat("Residual Deviance: ", dev_rel, "\n")
cat("Degrees of freedom: ", dfres_rel, "\n")
cat("p value: ", pchisq(dev_rel, dfres_rel, lower.tail = FALSE))
# summary with p values for ordinal logistic regression
summary_table <- coef(summary(olrmod_job_rel))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# partial model: family variables
olrmod_job_family <- polr(Job.or.housework ~ ., data = job_family, Hess = TRUE)
# Coefficient table with two-sided p values from the t statistics.
summary_table <- coef(summary(olrmod_job_family))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# Warning "glm.fit: fitted probabilities numerically 0 or 1 occurred"
# https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression
# partial model: personal history variables
olrmod_job_past <- polr(Job.or.housework ~ ., data = job_past, Hess = TRUE)
# Deviance-based goodness-of-fit summary.
dev_past <- olrmod_job_past$deviance
dfres_past <- olrmod_job_past$df.residual
cat("Residual Deviance: ", dev_past, "\n")
cat("Degrees of freedom: ", dfres_past, "\n")
cat("p value: ", pchisq(dev_past, dfres_past, lower.tail = FALSE))
# Coefficient table with two-sided p values.
summary_table <- coef(summary(olrmod_job_past))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# partial model: other variables
olrmod_job_other <- polr(Job.or.housework ~ ., data = job_other, Hess = TRUE)
# Coefficient table with two-sided p values.
summary_table <- coef(summary(olrmod_job_other))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# Combine the job-related variables with each other category separately, to compare.
# full(ish) model: job-related vs. family variables
olrmod_job_rel_family <- polr(Job.or.housework ~ ., data = job_rel_family, Hess = TRUE) #, maxit = 100)
# Deviance-based goodness-of-fit summary.
dev_rf <- olrmod_job_rel_family$deviance
dfres_rf <- olrmod_job_rel_family$df.residual
cat("Residual Deviance: ", dev_rf, "\n")
cat("Degrees of freedom: ", dfres_rf, "\n")
cat("p value: ", pchisq(dev_rf, dfres_rf, lower.tail = FALSE))
# 0/1 problem?
# Coefficient table with two-sided p values.
summary_table <- coef(summary(olrmod_job_rel_family))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# full(ish) model: job-related vs. past variables
olrmod_job_rel_past <- polr(Job.or.housework ~ ., data = job_rel_past, Hess = TRUE) #, maxit = 100)
# Deviance-based goodness-of-fit summary.
dev_rp <- olrmod_job_rel_past$deviance
dfres_rp <- olrmod_job_rel_past$df.residual
cat("Residual Deviance: ", dev_rp, "\n")
cat("Degrees of freedom: ", dfres_rp, "\n")
cat("p value: ", pchisq(dev_rp, dfres_rp, lower.tail = FALSE))
# 0/1 problem?
# Coefficient table with two-sided p values.
summary_table <- coef(summary(olrmod_job_rel_past))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# full(ish) model: job-related vs. other variables
olrmod_job_rel_other <- polr(Job.or.housework ~ ., data = job_rel_other, Hess = TRUE)
# Deviance-based goodness-of-fit summary.
dev_ro <- olrmod_job_rel_other$deviance
dfres_ro <- olrmod_job_rel_other$df.residual
cat("Residual Deviance: ", dev_ro, "\n")
cat("Degrees of freedom: ", dfres_ro, "\n")
cat("p value: ", pchisq(dev_ro, dfres_ro, lower.tail = FALSE))
# 0/1 problem?
# Coefficient table with two-sided p values.
summary_table <- coef(summary(olrmod_job_rel_other))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# Hand-pick a few variables that seem most important, just for fun (yes, we probably shouldn't do this).
# hand-picked variables
# NOTE(review): the data.frame(subset(...)) wrapper may be load-bearing under
# R < 4.0 (stringsAsFactors turned the mapped character columns into the
# factors polr() requires) -- confirm before simplifying.
jobsat_var = data.frame(subset(GSS_yr, select=c(Job.or.housework,
Respondents.income,
Number.of.hours.worked.last.week, #Number.of.hours.usually.work.a.week doesn't work
How.often.does.r.find.work.stressful,
Age.of.respondent,
Condition.of.health,
Rs.highest.degree,
Respondents.sex)))
head(jobsat_var)
# full model: hand-picked variables
library(MASS)
olrmod_jobsat_var <- polr(Job.or.housework ~ ., data = jobsat_var, Hess = TRUE)
# Deviance-based goodness-of-fit summary.
dev_hand <- olrmod_jobsat_var$deviance
dfres_hand <- olrmod_jobsat_var$df.residual
cat("Residual Deviance: ", dev_hand, "\n")
cat("Degrees of freedom: ", dfres_hand, "\n")
cat("p value: ", pchisq(dev_hand, dfres_hand, lower.tail = FALSE))
# summary with p values for ordinal logistic regression
summary_table <- coef(summary(olrmod_jobsat_var))
pval <- 2 * pnorm(-abs(summary_table[, "t value"]))
summary_table <- cbind(summary_table, "p value" = round(pval, 3))
summary_table
# individual models: hand-picked variables
# Fit a single-predictor ordinal logistic regression and print its
# coefficient table (with two-sided p values) plus deviance-based
# goodness-of-fit information.
#   data: data frame containing Job.or.housework plus the predictor column
#   var:  name (string) of the predictor column in `data`
solrmod <- function(data, var) {
  print(var)
  pred <- data[[var]]
  # BUG FIX: the model previously hard-coded data = jobsat_var (a global),
  # silently ignoring the `data` argument; use the argument so the function
  # works on any data frame passed in.
  olrmod_onevar <- polr(Job.or.housework ~ pred, data = data, Hess = TRUE)
  # summary with p values for ordinal logistic regression
  summary_table <- coef(summary(olrmod_onevar))
  pval <- pnorm(abs(summary_table[, "t value"]), lower.tail = FALSE) * 2
  summary_table <- cbind(summary_table, "p value" = round(pval, 3))
  print(summary_table)
  cat("Residual Deviance: ", olrmod_onevar$deviance, "\n")
  cat("Degrees of freedom: ", olrmod_onevar$df.residual, "\n")
  cat("p value: ", pchisq(olrmod_onevar$deviance, olrmod_onevar$df.residual, lower.tail = FALSE), "\n\n")
}
# Fit a one-predictor model for every hand-picked column (column 1 is the
# response, so start at 2).
for (col_idx in 2:ncol(jobsat_var)) {
  solrmod(jobsat_var, colnames(jobsat_var)[col_idx])
}